/* -*- c -*- */

/*
 * The purpose of this module is to add faster sort functions
 * that are type-specific.  This is done by altering the
 * function table for the builtin descriptors.
 *
 * These sorting functions are copied almost directly from numarray
 * with a few modifications (complex comparisons compare the imaginary
 * part if the real parts are equal, for example), and the names
 * are changed.
 *
 * The original sorting code is due to Charles R. Harris who wrote
 * it for numarray.
 */

/*
 * Quick sort is usually the fastest, but the worst case scenario is O(N^2) so
 * the code switches to the O(NlogN) worst case heapsort if not enough progress
 * is made on the large side of the two quicksort partitions. This improves the
 * worst case while still retaining the speed of quicksort for the common case.
 * This is variant known as introsort.
 *
 *
 * def introsort(lower, higher, recursion_limit=log2(higher - lower + 1) * 2):
 *   # sort remainder with heapsort if we are not making enough progress
 *   # we arbitrarily choose 2 * log(n) as the cutoff point
 *   if recursion_limit < 0:
 *       heapsort(lower, higher)
 *       return
 *
 *   if lower < higher:
 *      pivot_pos = partition(lower, higher)
 *      # recurse into smaller first and leave larger on stack
 *      # this limits the required stack space
 *      if (pivot_pos - lower > higher - pivot_pos):
 *          quicksort(pivot_pos + 1, higher, recursion_limit - 1)
 *          quicksort(lower, pivot_pos, recursion_limit - 1)
 *      else:
 *          quicksort(lower, pivot_pos, recursion_limit - 1)
 *          quicksort(pivot_pos + 1, higher, recursion_limit - 1)
 *
 *
 * the below code implements this converted to an iteration and as an
 * additional minor optimization skips the recursion depth checking on the
 * smaller partition as it is always less than half of the remaining data and
 * will thus terminate fast enough
 */

#define NPY_NO_DEPRECATED_API NPY_API_VERSION

#include "npy_sort.h"
#include "npysort_common.h"
#include <stdlib.h>

#define NOT_USED NPY_UNUSED(unused)
/*
 * pushing largest partition has upper bound of log2(n) space
 * we store two pointers each time
 */
#define PYA_QS_STACK (NPY_BITSOF_INTP * 2)
#define SMALL_QUICKSORT 15
#define SMALL_MERGESORT 20
#define SMALL_STRING 16


/*
 *****************************************************************************
 **                            NUMERIC SORTS                                **
 *****************************************************************************
 */


/**begin repeat
 *
 * #TYPE = BOOL, BYTE, UBYTE, SHORT, USHORT, INT, UINT, LONG, ULONG,
 *         LONGLONG, ULONGLONG, HALF, FLOAT, DOUBLE, LONGDOUBLE,
 *         CFLOAT, CDOUBLE, CLONGDOUBLE, DATETIME, TIMEDELTA#
 * #suff = bool, byte, ubyte, short, ushort, int, uint, long, ulong,
 *         longlong, ulonglong, half, float, double, longdouble,
 *         cfloat, cdouble, clongdouble, datetime, timedelta#
 * #type = npy_bool, npy_byte, npy_ubyte, npy_short, npy_ushort, npy_int,
 *         npy_uint, npy_long, npy_ulong, npy_longlong, npy_ulonglong,
 *         npy_ushort, npy_float, npy_double, npy_longdouble, npy_cfloat,
 *         npy_cdouble, npy_clongdouble, npy_datetime, npy_timedelta#
 */

NPY_NO_EXPORT int
quicksort_@suff@(void *start, npy_intp num, void *NOT_USED)
{
    @type@ vp;
    @type@ *pl = start;
    @type@ *pr = pl + num - 1;
    @type@ *stack[PYA_QS_STACK];
    @type@ **sptr = stack;
    @type@ *pm, *pi, *pj, *pk;
    int depth[PYA_QS_STACK];
    int * psdepth = depth;
    int cdepth = npy_get_msb(num) * 2;

    for (;;) {
        if (NPY_UNLIKELY(cdepth < 0)) {
            heapsort_@suff@(pl, pr - pl + 1, NULL);
            goto stack_pop;
        }
        while ((pr - pl) > SMALL_QUICKSORT) {
            /* quicksort partition */
            pm = pl + ((pr - pl) >> 1);
            if (@TYPE@_LT(*pm, *pl)) @TYPE@_SWAP(*pm, *pl);
            if (@TYPE@_LT(*pr, *pm)) @TYPE@_SWAP(*pr, *pm);
            if (@TYPE@_LT(*pm, *pl)) @TYPE@_SWAP(*pm, *pl);
            vp = *pm;
            pi = pl;
            pj = pr - 1;
            @TYPE@_SWAP(*pm, *pj);
            for (;;) {
                do ++pi; while (@TYPE@_LT(*pi, vp));
                do --pj; while (@TYPE@_LT(vp, *pj));
                if (pi >= pj) {
                    break;
                }
                @TYPE@_SWAP(*pi,*pj);
            }
            pk = pr - 1;
            @TYPE@_SWAP(*pi, *pk);
            /* push largest partition on stack */
            if (pi - pl < pr - pi) {
                *sptr++ = pi + 1;
                *sptr++ = pr;
                pr = pi - 1;
            }
            else {
                *sptr++ = pl;
                *sptr++ = pi - 1;
                pl = pi + 1;
            }
            *psdepth++ = --cdepth;
        }

        /* insertion sort */
        for (pi = pl + 1; pi <= pr; ++pi) {
            vp = *pi;
            pj = pi;
            pk = pi - 1;
            while (pj > pl && @TYPE@_LT(vp, *pk)) {
                *pj-- = *pk--;
            }
            *pj = vp;
        }
stack_pop:
        if (sptr == stack) {
            break;
        }
        pr = *(--sptr);
        pl = *(--sptr);
        cdepth = *(--psdepth);
    }

    return 0;
}


NPY_NO_EXPORT int
aquicksort_@suff@(void *vv, npy_intp* tosort, npy_intp num, void *NOT_USED)
{
    @type@ *v = vv;
    @type@ vp;
    npy_intp *pl = tosort;
    npy_intp *pr = tosort + num - 1;
    npy_intp *stack[PYA_QS_STACK];
    npy_intp **sptr = stack;
    npy_intp *pm, *pi, *pj, *pk, vi;
    int depth[PYA_QS_STACK];
    int * psdepth = depth;
    int cdepth = npy_get_msb(num) * 2;

    for (;;) {
        if (NPY_UNLIKELY(cdepth < 0)) {
            aheapsort_@suff@(vv, pl, pr - pl + 1, NULL);
            goto stack_pop;
        }
        while ((pr - pl) > SMALL_QUICKSORT) {
            /* quicksort partition */
            pm = pl + ((pr - pl) >> 1);
            if (@TYPE@_LT(v[*pm],v[*pl])) INTP_SWAP(*pm, *pl);
            if (@TYPE@_LT(v[*pr],v[*pm])) INTP_SWAP(*pr, *pm);
            if (@TYPE@_LT(v[*pm],v[*pl])) INTP_SWAP(*pm, *pl);
            vp = v[*pm];
            pi = pl;
            pj = pr - 1;
            INTP_SWAP(*pm, *pj);
            for (;;) {
                do ++pi; while (@TYPE@_LT(v[*pi], vp));
                do --pj; while (@TYPE@_LT(vp, v[*pj]));
                if (pi >= pj) {
                    break;
                }
                INTP_SWAP(*pi, *pj);
            }
            pk = pr - 1;
            INTP_SWAP(*pi,*pk);
            /* push largest partition on stack */
            if (pi - pl < pr - pi) {
                *sptr++ = pi + 1;
                *sptr++ = pr;
                pr = pi - 1;
            }
            else {
                *sptr++ = pl;
                *sptr++ = pi - 1;
                pl = pi + 1;
            }
            *psdepth++ = --cdepth;
        }

        /* insertion sort */
        for (pi = pl + 1; pi <= pr; ++pi) {
            vi = *pi;
            vp = v[vi];
            pj = pi;
            pk = pi - 1;
            while (pj > pl && @TYPE@_LT(vp, v[*pk])) {
                *pj-- = *pk--;
            }
            *pj = vi;
        }
stack_pop:
        if (sptr == stack) {
            break;
        }
        pr = *(--sptr);
        pl = *(--sptr);
        cdepth = *(--psdepth);
    }

    return 0;
}

/**end repeat**/


/*
 *****************************************************************************
 **                             STRING SORTS                                **
 *****************************************************************************
 */


/**begin repeat
 *
 * #TYPE = STRING, UNICODE#
 * #suff = string, unicode#
 * #type = npy_char, npy_ucs4#
 */

NPY_NO_EXPORT int
quicksort_@suff@(void *start, npy_intp num, void *varr)
{
    PyArrayObject *arr = varr;
    const size_t len = PyArray_ITEMSIZE(arr)/sizeof(@type@);
    @type@ *vp;
    @type@ *pl = start;
    @type@ *pr = pl + (num - 1)*len;
    @type@ *stack[PYA_QS_STACK], **sptr = stack, *pm, *pi, *pj, *pk;
    int depth[PYA_QS_STACK];
    int * psdepth = depth;
    int cdepth = npy_get_msb(num) * 2;

    /* Items that have zero size don't make sense to sort */
    if (len == 0) {
        return 0;
    }

    vp = malloc(PyArray_ITEMSIZE(arr));
    if (vp == NULL) {
        return -NPY_ENOMEM;
    }

    for (;;) {
        if (NPY_UNLIKELY(cdepth < 0)) {
            heapsort_@suff@(pl, (pr - pl) / len + 1, varr);
            goto stack_pop;
        }
        while ((size_t)(pr - pl) > SMALL_QUICKSORT*len) {
            /* quicksort partition */
            pm = pl + (((pr - pl)/len) >> 1)*len;
            if (@TYPE@_LT(pm, pl, len)) @TYPE@_SWAP(pm, pl, len);
            if (@TYPE@_LT(pr, pm, len)) @TYPE@_SWAP(pr, pm, len);
            if (@TYPE@_LT(pm, pl, len)) @TYPE@_SWAP(pm, pl, len);
            @TYPE@_COPY(vp, pm, len);
            pi = pl;
            pj = pr - len;
            @TYPE@_SWAP(pm, pj, len);
            for (;;) {
                do pi += len; while (@TYPE@_LT(pi, vp, len));
                do pj -= len; while (@TYPE@_LT(vp, pj, len));
                if (pi >= pj) {
                    break;
                }
                @TYPE@_SWAP(pi, pj, len);
            }
            pk = pr - len;
            @TYPE@_SWAP(pi, pk, len);
            /* push largest partition on stack */
            if (pi - pl < pr - pi) {
                *sptr++ = pi + len;
                *sptr++ = pr;
                pr = pi - len;
            }
            else {
                *sptr++ = pl;
                *sptr++ = pi - len;
                pl = pi + len;
            }
            *psdepth++ = --cdepth;
        }

        /* insertion sort */
        for (pi = pl + len; pi <= pr; pi += len) {
            @TYPE@_COPY(vp, pi, len);
            pj = pi;
            pk = pi - len;
            while (pj > pl && @TYPE@_LT(vp, pk, len)) {
                @TYPE@_COPY(pj, pk, len);
                pj -= len;
                pk -= len;
            }
            @TYPE@_COPY(pj, vp, len);
        }
stack_pop:
        if (sptr == stack) {
            break;
        }
        pr = *(--sptr);
        pl = *(--sptr);
        cdepth = *(--psdepth);
    }

    free(vp);
    return 0;
}


NPY_NO_EXPORT int
aquicksort_@suff@(void *vv, npy_intp* tosort, npy_intp num, void *varr)
{
    @type@ *v = vv;
    PyArrayObject *arr = varr;
    size_t len = PyArray_ITEMSIZE(arr)/sizeof(@type@);
    @type@ *vp;
    npy_intp *pl = tosort;
    npy_intp *pr = tosort + num - 1;
    npy_intp *stack[PYA_QS_STACK];
    npy_intp **sptr=stack;
    npy_intp *pm, *pi, *pj, *pk, vi;
    int depth[PYA_QS_STACK];
    int * psdepth = depth;
    int cdepth = npy_get_msb(num) * 2;

    /* Items that have zero size don't make sense to sort */
    if (len == 0) {
        return 0;
    }

    for (;;) {
        if (NPY_UNLIKELY(cdepth < 0)) {
            aheapsort_@suff@(vv, pl, pr - pl + 1, varr);
            goto stack_pop;
        }
        while ((pr - pl) > SMALL_QUICKSORT) {
            /* quicksort partition */
            pm = pl + ((pr - pl) >> 1);
            if (@TYPE@_LT(v + (*pm)*len, v + (*pl)*len, len)) INTP_SWAP(*pm, *pl);
            if (@TYPE@_LT(v + (*pr)*len, v + (*pm)*len, len)) INTP_SWAP(*pr, *pm);
            if (@TYPE@_LT(v + (*pm)*len, v + (*pl)*len, len)) INTP_SWAP(*pm, *pl);
            vp = v + (*pm)*len;
            pi = pl;
            pj = pr - 1;
            INTP_SWAP(*pm,*pj);
            for (;;) {
                do ++pi; while (@TYPE@_LT(v + (*pi)*len, vp, len));
                do --pj; while (@TYPE@_LT(vp, v + (*pj)*len, len));
                if (pi >= pj) {
                    break;
                }
                INTP_SWAP(*pi,*pj);
            }
            pk = pr - 1;
            INTP_SWAP(*pi,*pk);
            /* push largest partition on stack */
            if (pi - pl < pr - pi) {
                *sptr++ = pi + 1;
                *sptr++ = pr;
                pr = pi - 1;
            }
            else {
                *sptr++ = pl;
                *sptr++ = pi - 1;
                pl = pi + 1;
            }
            *psdepth++ = --cdepth;
        }

        /* insertion sort */
        for (pi = pl + 1; pi <= pr; ++pi) {
            vi = *pi;
            vp = v + vi*len;
            pj = pi;
            pk = pi - 1;
            while (pj > pl && @TYPE@_LT(vp, v + (*pk)*len, len)) {
                *pj-- = *pk--;
            }
            *pj = vi;
        }
stack_pop:
        if (sptr == stack) {
            break;
        }
        pr = *(--sptr);
        pl = *(--sptr);
        cdepth = *(--psdepth);
    }

    return 0;
}

/**end repeat**/


/*
 *****************************************************************************
 **                             GENERIC SORT                                **
 *****************************************************************************
 */


NPY_NO_EXPORT int
npy_quicksort(void *start, npy_intp num, void *varr)
{
    PyArrayObject *arr = varr;
    npy_intp elsize = PyArray_ITEMSIZE(arr);
    PyArray_CompareFunc *cmp = PyArray_DESCR(arr)->f->compare;
    char *vp;
    char *pl = start;
    char *pr = pl + (num - 1)*elsize;
    char *stack[PYA_QS_STACK];
    char **sptr = stack;
    char *pm, *pi, *pj, *pk;
    int depth[PYA_QS_STACK];
    int * psdepth = depth;
    int cdepth = npy_get_msb(num) * 2;

    /* Items that have zero size don't make sense to sort */
    if (elsize == 0) {
        return 0;
    }

    vp = malloc(elsize);
    if (vp == NULL) {
        return -NPY_ENOMEM;
    }

    for (;;) {
        if (NPY_UNLIKELY(cdepth < 0)) {
            npy_heapsort(pl, (pr - pl) / elsize + 1, varr);
            goto stack_pop;
        }
        while(pr - pl > SMALL_QUICKSORT*elsize) {
            /* quicksort partition */
            pm = pl + (((pr - pl) / elsize) >> 1) * elsize;
            if (cmp(pm, pl, arr) < 0) {
                GENERIC_SWAP(pm, pl, elsize);
            }
            if (cmp(pr, pm, arr) < 0) {
                GENERIC_SWAP(pr, pm, elsize);
            }
            if (cmp(pm, pl, arr) < 0) {
                GENERIC_SWAP(pm, pl, elsize);
            }
            GENERIC_COPY(vp, pm, elsize);
            pi = pl;
            pj = pr - elsize;
            GENERIC_SWAP(pm, pj, elsize);
            /*
             * Generic comparisons may be buggy, so don't rely on the sentinels
             * to keep the pointers from going out of bounds.
             */
            for (;;) {
                do {
                    pi += elsize;
                } while (cmp(pi, vp, arr) < 0 && pi < pj);
                do {
                    pj -= elsize;
                } while (cmp(vp, pj, arr) < 0 && pi < pj);
                if (pi >= pj) {
                    break;
                }
                GENERIC_SWAP(pi, pj, elsize);
            }
            pk = pr - elsize;
            GENERIC_SWAP(pi, pk, elsize);
            /* push largest partition on stack */
            if (pi - pl < pr - pi) {
                *sptr++ = pi + elsize;
                *sptr++ = pr;
                pr = pi - elsize;
            }
            else {
                *sptr++ = pl;
                *sptr++ = pi - elsize;
                pl = pi + elsize;
            }
            *psdepth++ = --cdepth;
        }

        /* insertion sort */
        for (pi = pl + elsize; pi <= pr; pi += elsize) {
            GENERIC_COPY(vp, pi, elsize);
            pj = pi;
            pk = pi - elsize;
            while (pj > pl && cmp(vp, pk, arr) < 0) {
                GENERIC_COPY(pj, pk, elsize);
                pj -= elsize;
                pk -= elsize;
            }
            GENERIC_COPY(pj, vp, elsize);
        }
stack_pop:
        if (sptr == stack) {
            break;
        }
        pr = *(--sptr);
        pl = *(--sptr);
        cdepth = *(--psdepth);
    }

    free(vp);
    return 0;
}


NPY_NO_EXPORT int
npy_aquicksort(void *vv, npy_intp* tosort, npy_intp num, void *varr)
{
    char *v = vv;
    PyArrayObject *arr = varr;
    npy_intp elsize = PyArray_ITEMSIZE(arr);
    PyArray_CompareFunc *cmp = PyArray_DESCR(arr)->f->compare;
    char *vp;
    npy_intp *pl = tosort;
    npy_intp *pr = tosort + num - 1;
    npy_intp *stack[PYA_QS_STACK];
    npy_intp **sptr = stack;
    npy_intp *pm, *pi, *pj, *pk, vi;
    int depth[PYA_QS_STACK];
    int * psdepth = depth;
    int cdepth = npy_get_msb(num) * 2;

    /* Items that have zero size don't make sense to sort */
    if (elsize == 0) {
        return 0;
    }

    for (;;) {
        if (NPY_UNLIKELY(cdepth < 0)) {
            npy_aheapsort(vv, pl, pr - pl + 1, varr);
            goto stack_pop;
        }
        while ((pr - pl) > SMALL_QUICKSORT) {
            /* quicksort partition */
            pm = pl + ((pr - pl) >> 1);
            if (cmp(v + (*pm)*elsize, v + (*pl)*elsize, arr) < 0) {
                INTP_SWAP(*pm, *pl);
            }
            if (cmp(v + (*pr)*elsize, v + (*pm)*elsize, arr) < 0) {
                INTP_SWAP(*pr, *pm);
            }
            if (cmp(v + (*pm)*elsize, v + (*pl)*elsize, arr) < 0) {
                INTP_SWAP(*pm, *pl);
            }
            vp = v + (*pm)*elsize;
            pi = pl;
            pj = pr - 1;
            INTP_SWAP(*pm,*pj);
            for (;;) {
                do {
                    ++pi;
                } while (cmp(v + (*pi)*elsize, vp, arr) < 0 && pi < pj);
                do {
                    --pj;
                } while (cmp(vp, v + (*pj)*elsize, arr) < 0 && pi < pj);
                if (pi >= pj) {
                    break;
                }
                INTP_SWAP(*pi,*pj);
            }
            pk = pr - 1;
            INTP_SWAP(*pi,*pk);
            /* push largest partition on stack */
            if (pi - pl < pr - pi) {
                *sptr++ = pi + 1;
                *sptr++ = pr;
                pr = pi - 1;
            }
            else {
                *sptr++ = pl;
                *sptr++ = pi - 1;
                pl = pi + 1;
            }
            *psdepth++ = --cdepth;
        }

        /* insertion sort */
        for (pi = pl + 1; pi <= pr; ++pi) {
            vi = *pi;
            vp = v + vi*elsize;
            pj = pi;
            pk = pi - 1;
            while (pj > pl && cmp(vp, v + (*pk)*elsize, arr) < 0) {
                *pj-- = *pk--;
            }
            *pj = vi;
        }
stack_pop:
        if (sptr == stack) {
            break;
        }
        pr = *(--sptr);
        pl = *(--sptr);
        cdepth = *(--psdepth);
    }

    return 0;
}
